/*c_clean_nsc_data.do**********************************************************

	Clean NSC data for easier use in future analysis

******************************************************************************/
set more off


************************ Clean NSC Data ****************************************
* Load the raw NSC results file.
use "$stata_data_nsc/full_results.dta", clear

* Split the search-request return field by character position:
*   chars 1-6  -> studentno
*   char  7    -> source
*   chars 8-11 -> year
gen studentno = substr(requesterreturnfield,1,6)
gen year = substr(requesterreturnfield,8,4)
gen source = substr(requesterreturnfield,7,1)

* Convert the two numeric pieces from string to numeric in place.
destring studentno year, replace

* Shorter names for the name components used as by-group keys below.
rename (firstname lastname middleinitial namesuffix) (fname lname mi sname)

* Build 0/1 indicators describing each college record.
order studentno

* Sector of the institution.
gen public = (publicprivate == "Public")
gen private = (publicprivate == "Private")

* Record carries a graduation flag.
gen grad = (graduated == "Y")

* Location relative to MA; a blank state counts as neither in- nor out-of-state.
gen outofstate = !inlist(collegestate, "MA", "")
gen instate = (collegestate == "MA")

* A record was found at all, and whether it is a 4-year or 2-year institution.
gen any = (recordfound == "Y")
gen any4yr = (year4year == "4")
gen any2yr = (year4year == "2")

* Enrollment spell ends on or after the 20200400 cutoff (numeric YYYYMMDD comparison).
gen still_enroll = !missing(enrollmentend) & enrollmentend >= 20200400

* Convert the numeric YYYYMMDD enrollment dates into string copies and Stata daily dates.
* String copies are created first, for both variables, so the column order is
* enrollmentend_str, enrollmentbegin_str, enrollmentend_date, enrollmentbegin_date.
foreach v in enrollmentend enrollmentbegin {
	tostring `v', gen(`v'_str)
}

foreach v in enrollmentend enrollmentbegin {
	* date() returns missing when the string cannot be parsed (e.g. "." for a missing value).
	gen `v'_date = date(`v'_str, "YMD")
	format `v'_date %td

	* tostring writes "." for numeric missing; store an empty string instead.
	* This is applied to both the begin and end strings so they are consistent.
	replace `v'_str = "" if `v'_str == "."
}

/* Student-level summaries below are computed with by-group egen functions
(max, min, total) instead of collapse, so the dataset keeps one row per record.
We need this because the first observation for each student is kept further
below and used in later analysis. */

* Track time enrolled and number of enrollment spells per student.
* Students are identified by fname lname mi dob throughout this file.

* Length of each enrollment record in days (missing if either date is missing).
gen time_enrolled_sem = enrollmentend_date - enrollmentbegin_date

* Total days enrolled across all of a student's records; egen total() ignores missing.
bysort fname lname mi dob: egen time_enrolled = total(time_enrolled_sem)
gen yrs_enrolled = time_enrolled/365

* Number of records that have both a begin and an end date.
* The previous version took max(_n) over every row of the student, so rows with
* no enrollment dates were counted and the reset to 0 could never trigger.
* total() of the 0/1 condition yields 0 for students with no dated records.
bysort fname lname mi dob: egen num_sem = total(!missing(enrollmentbegin) & !missing(enrollmentend))

* For enrollmentend, find the date associated with the first graduation.
* Variables are stored as double: enrollmentend holds 8-digit YYYYMMDD integers
* (graduationdate presumably does too - confirm), and the default float type
* cannot represent such values exactly, which silently shifts dates.
* Within each student-by-grad group, sorting on graduationdate puts the earliest
* non-missing date first (missing sorts last).
bysort fname lname mi dob grad (graduationdate): gen double enrollmentend_nsc_temp = graduationdate[1] if grad == 1
bysort fname lname mi dob: egen double enrollmentend_nsc = min(enrollmentend_nsc_temp)

* If there is no graduation date, fall back to the latest enrollment end date.
* egen max() ignores missing values and does not depend on the current row order.
tempvar last_end
bysort fname lname mi dob: egen double `last_end' = max(enrollmentend)
replace enrollmentend_nsc = `last_end' if missing(enrollmentend_nsc)
drop `last_end'

* Record if first college graduated from is 4 year.
* sort has no descending syntax (a minus sign is not a descending marker there),
* so graduation rows are brought to the top by sorting on the negated grad flag.
* Within each student the order is: graduated rows first, then earliest
* graduationdate, then lowest collegesequence.
* For students with no graduation row, the value comes from their first row
* under the same ordering (lowest collegesequence among rows tied on graduationdate).
tempvar neg_grad
gen byte `neg_grad' = -grad
bysort fname lname mi dob (`neg_grad' graduationdate collegesequence): gen first_grad_4yr = any4yr[1]
drop `neg_grad'

* Graduation analysis but for 4 year universities.
* Stored as double: enrollmentend holds 8-digit YYYYMMDD integers (graduationdate
* presumably does too - confirm), which the default float type cannot hold exactly.
* Earliest graduation date among records that are both graduations and 4-year.
bysort fname lname mi dob grad any4yr (graduationdate): gen double enrollmentend_nsc_temp_4yr = graduationdate[1] if grad == 1 & any4yr == 1
bysort fname lname mi dob: egen double enrollmentend_nsc_4yr = min(enrollmentend_nsc_temp_4yr)

* If there is no 4-year graduation date, fall back to the latest enrollment end
* date across all of the student's records (not restricted to 4-year colleges).
tempvar last_end_all
bysort fname lname mi dob: egen double `last_end_all' = max(enrollmentend)
replace enrollmentend_nsc_4yr = `last_end_all' if missing(enrollmentend_nsc_4yr)
drop `last_end_all'

* For beginning enrollment, take the earliest begin date across a student's records.
* The result is stored as double: enrollmentbegin is an 8-digit YYYYMMDD number,
* and egen's default float type would round it.
* Add further variables to min_interest to get a <var>_nsc minimum for each.
local min_interest enrollmentbegin
foreach result of varlist `min_interest' {
	bysort fname lname mi dob: egen double `result'_nsc = min(`result')
}


* Student-level "ever" indicators: 1 if any of the student's records has the flag.
local nsc_flags outofstate any any4yr any2yr instate public private grad
foreach flag of local nsc_flags {

	* Ever had a record with this flag set
	bysort fname lname mi dob: egen ever`flag'_nsc = max(`flag')

	* Ever had a 4-year record with this flag set (the product is 1 only when both are 1)
	bysort fname lname mi dob: egen ever`flag'_nsc_4yr = max(any4yr * `flag')

}

* Write out the record-level data with every variable before trimming.
save "$stata_data_nsc/full_nsc.dta", replace

* Trim to the variables used downstream, grouped by role.
* keep does not reorder columns, so only the set of names matters here.
local id_vars studentno fname mi lname sname dob
local college_vars collegecodebranch collegename collegestate year4year ///
		publicprivate collegesequence
local enroll_vars enrollmentbegin enrollmentbegin_str enrollmentend ///
		enrollmentend_str enrollmentstatus enrollmentend_date enrollmentbegin_date ///
		time_enrolled_sem time_enrolled yrs_enrolled num_sem
local flag_vars public private grad outofstate instate any any4yr any2yr still_enroll
local summary_vars enrollmentend_nsc first_grad_4yr enrollmentend_nsc_4yr enrollmentbegin_nsc
local ever_vars everoutofstate_nsc everoutofstate_nsc_4yr everany_nsc everany_nsc_4yr ///
		everany4yr_nsc everany4yr_nsc_4yr everany2yr_nsc everany2yr_nsc_4yr ///
		everinstate_nsc everinstate_nsc_4yr everpublic_nsc everpublic_nsc_4yr ///
		everprivate_nsc everprivate_nsc_4yr evergrad_nsc evergrad_nsc_4yr

keep `id_vars' `college_vars' `enroll_vars' `flag_vars' `summary_vars' `ever_vars'

* Keep the earliest college observation to track first school of enrollment.
* sort has no descending syntax (a minus sign is not a descending marker there),
* and gsort has no stable option, so the two descending keys are negated into
* temporary variables and sorted ascending with stable.
* Order within fname lname mi dob: sname, studentno, records with any == 1 first,
* lowest collegesequence, earliest enrollmentbegin, then longest spell.
tempvar neg_any neg_time_sem
gen byte `neg_any' = -any
gen double `neg_time_sem' = -time_enrolled_sem
sort fname lname mi dob sname studentno `neg_any' collegesequence enrollmentbegin `neg_time_sem', stable

* Number the rows within each student and keep only the first one.
by fname lname mi dob: gen order = _n
keep if order == 1
drop `neg_any' `neg_time_sem'

* Give the kept record's indicators an _nsc suffix and add 4-year-only versions.
* any4yr must stay first in the list: every later iteration multiplies by any4yr_nsc.
local to_suffix any4yr outofstate any any2yr instate public private
foreach v of local to_suffix {
	rename `v' `v'_nsc
	gen `v'_nsc_4yr = any4yr_nsc * `v'_nsc
}

* Create the preknsc version where everyone has unique names
* (one row per fname lname mi dob), sorted on every variable.
sort _all
save "$stata_data_nsc/preknscclean_names.dta", replace
